#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	iotas32, %object
.align	5
iotas32:
.long	0x00000001, 0x00000000
.long	0x00000000, 0x00000089
.long	0x00000000, 0x8000008b
.long	0x00000000, 0x80008080
.long	0x00000001, 0x0000008b
.long	0x00000001, 0x00008000
.long	0x00000001, 0x80008088
.long	0x00000001, 0x80000082
.long	0x00000000, 0x0000000b
.long	0x00000000, 0x0000000a
.long	0x00000001, 0x00008082
.long	0x00000000, 0x00008003
.long	0x00000001, 0x0000808b
.long	0x00000001, 0x8000000b
.long	0x00000001, 0x8000008a
.long	0x00000001, 0x80000081
.long	0x00000000, 0x80000081
.long	0x00000000, 0x80000008
.long	0x00000000, 0x00000083
.long	0x00000000, 0x80008003
.long	0x00000001, 0x80008088
.long	0x00000000, 0x80000088
.long	0x00000001, 0x00008000
.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	r9,sp,#176
	add	r12,sp,#0
	add	r10,sp,#40
	ldmia	r9,{r4,r5,r6,r7,r8,r9}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	r11,r11,r11
	str	r11,[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	r0,r0,r10
	eor	r1,r1,r11
	eor	r2,r2,r12
	ldrd	r10,r11,[sp,#56]
	eor	r3,r3,r14
	ldrd	r12,r14,[sp,#64]
	eor	r4,r4,r10
	eor	r5,r5,r11
	eor	r6,r6,r12
	ldrd	r10,r11,[sp,#72]
	eor	r7,r7,r14
	ldrd	r12,r14,[sp,#80]
	eor	r8,r8,r10
	eor	r9,r9,r11
	eor	r0,r0,r12
	ldrd	r10,r11,[sp,#88]
	eor	r1,r1,r14
	ldrd	r12,r14,[sp,#96]
	eor	r2,r2,r10
	eor	r3,r3,r11
	eor	r4,r4,r12
	ldrd	r10,r11,[sp,#104]
	eor	r5,r5,r14
	ldrd	r12,r14,[sp,#112]
	eor	r6,r6,r10
	eor	r7,r7,r11
	eor	r8,r8,r12
	ldrd	r10,r11,[sp,#120]
	eor	r9,r9,r14
	ldrd	r12,r14,[sp,#128]
	eor	r0,r0,r10
	eor	r1,r1,r11
	eor	r2,r2,r12
	ldrd	r10,r11,[sp,#136]
	eor	r3,r3,r14
	ldrd	r12,r14,[sp,#144]
	eor	r4,r4,r10
	eor	r5,r5,r11
	eor	r6,r6,r12
	ldrd	r10,r11,[sp,#152]
	eor	r7,r7,r14
	ldrd	r12,r14,[sp,#160]
	eor	r8,r8,r10
	eor	r9,r9,r11
	eor	r0,r0,r12
	ldrd	r10,r11,[sp,#168]
	eor	r1,r1,r14
	ldrd	r12,r14,[sp,#16]
	eor	r2,r2,r10
	eor	r3,r3,r11
	eor	r4,r4,r12
	ldrd	r10,r11,[sp,#24]
	eor	r5,r5,r14
	ldrd	r12,r14,[sp,#32]
#else
	eor	r0,r0,r10
	add	r10,sp,#56
	eor	r1,r1,r11
	eor	r2,r2,r12
	eor	r3,r3,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
	eor	r4,r4,r10
	add	r10,sp,#72
	eor	r5,r5,r11
	eor	r6,r6,r12
	eor	r7,r7,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
	eor	r8,r8,r10
	add	r10,sp,#88
	eor	r9,r9,r11
	eor	r0,r0,r12
	eor	r1,r1,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
	eor	r2,r2,r10
	add	r10,sp,#104
	eor	r3,r3,r11
	eor	r4,r4,r12
	eor	r5,r5,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
	eor	r6,r6,r10
	add	r10,sp,#120
	eor	r7,r7,r11
	eor	r8,r8,r12
	eor	r9,r9,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
	eor	r0,r0,r10
	add	r10,sp,#136
	eor	r1,r1,r11
	eor	r2,r2,r12
	eor	r3,r3,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
	eor	r4,r4,r10
	add	r10,sp,#152
	eor	r5,r5,r11
	eor	r6,r6,r12
	eor	r7,r7,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
	eor	r8,r8,r10
	ldr	r10,[sp,#168]		@ A[4][1]
	eor	r9,r9,r11
	ldr	r11,[sp,#168+4]
	eor	r0,r0,r12
	ldr	r12,[sp,#16]		@ A[0][2]
	eor	r1,r1,r14
	ldr	r14,[sp,#16+4]
	eor	r2,r2,r10
	add	r10,sp,#24
	eor	r3,r3,r11
	eor	r4,r4,r12
	eor	r5,r5,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
#endif
	eor	r6,r6,r10
	eor	r7,r7,r11
	eor	r8,r8,r12
	eor	r9,r9,r14

	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
#ifndef	__thumb2__
	str	r10,[sp,#208]		@ D[1] = E[0]
#endif
	eor	r11,r1,r4
#ifndef	__thumb2__
	str	r11,[sp,#208+4]
#else
	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
#endif
	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	r14,r7,r0
#ifndef	__thumb2__
	str	r12,[sp,#232]		@ D[4] = E[1]
#endif
	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
#ifndef	__thumb2__
	str	r14,[sp,#232+4]
#else
	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
#endif
	eor	r1,r9,r2
#ifndef	__thumb2__
	str	r0,[sp,#200]		@ D[0] = C[0]
#endif
	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
#ifndef	__thumb2__
	ldr	r7,[sp,#144]
#endif
	eor	r3,r3,r6
#ifndef	__thumb2__
	str	r1,[sp,#200+4]
#else
	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
#endif
#ifndef	__thumb2__
	ldr	r6,[sp,#144+4]
#else
	ldrd	r7,r6,[sp,#144]
#endif
#ifndef	__thumb2__
	str	r2,[sp,#216]		@ D[2] = C[1]
#endif
	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
#ifndef	__thumb2__
	str	r3,[sp,#216+4]
#else
	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
#endif
	eor	r5,r5,r8

#ifndef	__thumb2__
	ldr	r8,[sp,#192]
#endif
#ifndef	__thumb2__
	ldr	r9,[sp,#192+4]
#else
	ldrd	r8,r9,[sp,#192]
#endif
#ifndef	__thumb2__
	str	r4,[sp,#224]		@ D[3] = C[2]
#endif
	eor	r7,r7,r4
#ifndef	__thumb2__
	str	r5,[sp,#224+4]
#else
	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
#endif
	eor	r6,r6,r5
#ifndef	__thumb2__
	ldr	r4,[sp,#0]
#endif
	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ mov	r6,r6,ror#32-11
#ifndef	__thumb2__
	ldr	r5,[sp,#0+4]
#else
	ldrd	r4,r5,[sp,#0]
#endif
	eor	r8,r8,r12
	eor	r9,r9,r14
#ifndef	__thumb2__
	ldr	r12,[sp,#96]
#endif
	eor	r0,r0,r4
#ifndef	__thumb2__
	ldr	r14,[sp,#96+4]
#else
	ldrd	r12,r14,[sp,#96]
#endif
	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ mov	r9,r9,ror#32-7
	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
	eor	r12,r12,r2
#ifndef	__thumb2__
	ldr	r2,[sp,#48]
#endif
	eor	r14,r14,r3
#ifndef	__thumb2__
	ldr	r3,[sp,#48+4]
#else
	ldrd	r2,r3,[sp,#48]
#endif
	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
	ldr	r12,[sp,#444]			@ load counter
	eor	r2,r2,r10
	adr	r10,iotas32
	mov	r4,r14,ror#32-22
	add	r14,r10,r12
	eor	r3,r3,r11
	ldmia	r14,{r10,r11}		@ iotas[i]
	bic	r12,r4,r2,ror#32-22
	bic	r14,r5,r3,ror#32-22
	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
	mov	r3,r3,ror#32-22
	eor	r12,r12,r0
	eor	r14,r14,r1
	eor	r10,r10,r12
	eor	r11,r11,r14
#ifndef	__thumb2__
	str	r10,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
#endif
	bic	r12,r6,r4,ror#11
#ifndef	__thumb2__
	str	r11,[sp,#240+4]
#else
	strd	r10,r11,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
#endif
	bic	r14,r7,r5,ror#10
	bic	r10,r8,r6,ror#32-(11-7)
	bic	r11,r9,r7,ror#32-(10-7)
	eor	r12,r2,r12,ror#32-11
#ifndef	__thumb2__
	str	r12,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r14,r3,r14,ror#32-10
#ifndef	__thumb2__
	str	r14,[sp,#248+4]
#else
	strd	r12,r14,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r10,r4,r10,ror#32-7
	eor	r11,r5,r11,ror#32-7
#ifndef	__thumb2__
	str	r10,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r12,r0,r8,ror#32-7
#ifndef	__thumb2__
	str	r11,[sp,#256+4]
#else
	strd	r10,r11,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r14,r1,r9,ror#32-7
	eor	r12,r12,r6,ror#32-11
#ifndef	__thumb2__
	str	r12,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
#endif
	eor	r14,r14,r7,ror#32-10
#ifndef	__thumb2__
	str	r14,[sp,#264+4]
#else
	strd	r12,r14,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
#endif
	bic	r10,r2,r0
	add	r14,sp,#224
#ifndef	__thumb2__
	ldr	r0,[sp,#24]		@ A[0][3]
#endif
	bic	r11,r3,r1
#ifndef	__thumb2__
	ldr	r1,[sp,#24+4]
#else
	ldrd	r0,r1,[sp,#24]		@ A[0][3]
#endif
	eor	r10,r10,r8,ror#32-7
	eor	r11,r11,r9,ror#32-7
#ifndef	__thumb2__
	str	r10,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif
	add	r9,sp,#200
#ifndef	__thumb2__
	str	r11,[sp,#272+4]
#else
	strd	r10,r11,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif

	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]

#ifndef	__thumb2__
	ldr	r2,[sp,#72]		@ A[1][4]
#endif
	eor	r0,r0,r10
#ifndef	__thumb2__
	ldr	r3,[sp,#72+4]
#else
	ldrd	r2,r3,[sp,#72]		@ A[1][4]
#endif
	eor	r1,r1,r11
	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
#ifndef	__thumb2__
	ldr	r10,[sp,#128]		@ A[3][1]
#endif
	@ mov	r1,r1,ror#32-14
#ifndef	__thumb2__
	ldr	r11,[sp,#128+4]
#else
	ldrd	r10,r11,[sp,#128]		@ A[3][1]
#endif

	eor	r2,r2,r12
#ifndef	__thumb2__
	ldr	r4,[sp,#80]		@ A[2][0]
#endif
	eor	r3,r3,r14
#ifndef	__thumb2__
	ldr	r5,[sp,#80+4]
#else
	ldrd	r4,r5,[sp,#80]		@ A[2][0]
#endif
	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ mov	r3,r3,ror#32-10

	eor	r6,r6,r4
#ifndef	__thumb2__
	ldr	r12,[sp,#216]		@ D[2]
#endif
	eor	r7,r7,r5
#ifndef	__thumb2__
	ldr	r14,[sp,#216+4]
#else
	ldrd	r12,r14,[sp,#216]		@ D[2]
#endif
	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	mov	r4,r7,ror#32-2

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#176]		@ A[4][2]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#176+4]
#else
	ldrd	r8,r9,[sp,#176]		@ A[4][2]
#endif
	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	mov	r6,r11,ror#32-23

	bic	r10,r4,r2,ror#32-10
	bic	r11,r5,r3,ror#32-10
	eor	r12,r12,r8
	eor	r14,r14,r9
	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	mov	r8,r14,ror#32-31
	eor	r10,r10,r0,ror#32-14
	eor	r11,r11,r1,ror#32-14
#ifndef	__thumb2__
	str	r10,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r12,r6,r4
#ifndef	__thumb2__
	str	r11,[sp,#280+4]
#else
	strd	r10,r11,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r14,r7,r5
	eor	r12,r12,r2,ror#32-10
#ifndef	__thumb2__
	str	r12,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r14,r14,r3,ror#32-10
#ifndef	__thumb2__
	str	r14,[sp,#288+4]
#else
	strd	r12,r14,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6
	bic	r11,r9,r7
	bic	r12,r0,r8,ror#14
	bic	r14,r1,r9,ror#14
	eor	r10,r10,r4
	eor	r11,r11,r5
#ifndef	__thumb2__
	str	r10,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r2,r2,r0,ror#32-(14-10)
#ifndef	__thumb2__
	str	r11,[sp,#296+4]
#else
	strd	r10,r11,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
#endif
	eor	r12,r6,r12,ror#32-14
	bic	r11,r3,r1,ror#32-(14-10)
#ifndef	__thumb2__
	str	r12,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
#endif
	eor	r14,r7,r14,ror#32-14
#ifndef	__thumb2__
	str	r14,[sp,#304+4]
#else
	strd	r12,r14,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
#endif
	add	r12,sp,#208
#ifndef	__thumb2__
	ldr	r1,[sp,#8]		@ A[0][1]
#endif
	eor	r10,r8,r2,ror#32-10
#ifndef	__thumb2__
	ldr	r0,[sp,#8+4]
#else
	ldrd	r1,r0,[sp,#8]		@ A[0][1]
#endif
	eor	r11,r9,r11,ror#32-10
#ifndef	__thumb2__
	str	r10,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif
#ifndef	__thumb2__
	str	r11,[sp,#312+4]
#else
	strd	r10,r11,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif

	add	r9,sp,#224
	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
#ifndef	__thumb2__
	ldr	r2,[sp,#56]		@ A[1][2]
#endif
#ifndef	__thumb2__
	ldr	r3,[sp,#56+4]
#else
	ldrd	r2,r3,[sp,#56]		@ A[1][2]
#endif
	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]

	eor	r1,r1,r10
#ifndef	__thumb2__
	ldr	r4,[sp,#104]		@ A[2][3]
#endif
	eor	r0,r0,r11
#ifndef	__thumb2__
	ldr	r5,[sp,#104+4]
#else
	ldrd	r4,r5,[sp,#104]		@ A[2][3]
#endif
	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	r2,r2,r12
#ifndef	__thumb2__
	ldr	r10,[sp,#152]		@ A[3][4]
#endif
	eor	r3,r3,r14
#ifndef	__thumb2__
	ldr	r11,[sp,#152+4]
#else
	ldrd	r10,r11,[sp,#152]		@ A[3][4]
#endif
	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
#ifndef	__thumb2__
	ldr	r12,[sp,#200]		@ D[0]
#endif
	@ mov	r3,r3,ror#32-3
#ifndef	__thumb2__
	ldr	r14,[sp,#200+4]
#else
	ldrd	r12,r14,[sp,#200]		@ D[0]
#endif

	eor	r4,r4,r6
	eor	r5,r5,r7
	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ mov	r4,r7,ror#32-13		@ [track reverse order below]

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#160]		@ A[4][0]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#160+4]
#else
	ldrd	r8,r9,[sp,#160]		@ A[4][0]
#endif
	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	mov	r7,r11,ror#32-4

	eor	r12,r12,r8
	eor	r14,r14,r9
	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	mov	r9,r14,ror#32-9

	bic	r10,r5,r2,ror#13-3
	bic	r11,r4,r3,ror#12-3
	bic	r12,r6,r5,ror#32-13
	bic	r14,r7,r4,ror#32-12
	eor	r10,r0,r10,ror#32-13
	eor	r11,r1,r11,ror#32-12
#ifndef	__thumb2__
	str	r10,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
#endif
	eor	r12,r12,r2,ror#32-3
#ifndef	__thumb2__
	str	r11,[sp,#320+4]
#else
	strd	r10,r11,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
#endif
	eor	r14,r14,r3,ror#32-3
#ifndef	__thumb2__
	str	r12,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6
	bic	r11,r9,r7
#ifndef	__thumb2__
	str	r14,[sp,#328+4]
#else
	strd	r12,r14,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r10,r10,r5,ror#32-13
	eor	r11,r11,r4,ror#32-12
#ifndef	__thumb2__
	str	r10,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r12,r0,r8
#ifndef	__thumb2__
	str	r11,[sp,#336+4]
#else
	strd	r10,r11,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r14,r1,r9
	eor	r12,r12,r6
	eor	r14,r14,r7
#ifndef	__thumb2__
	str	r12,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
#endif
	bic	r10,r2,r0,ror#3
#ifndef	__thumb2__
	str	r14,[sp,#344+4]
#else
	strd	r12,r14,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
#endif
	bic	r11,r3,r1,ror#3
#ifndef	__thumb2__
	ldr	r1,[sp,#32]		@ A[0][4] [in reverse order]
#endif
	eor	r10,r8,r10,ror#32-3
#ifndef	__thumb2__
	ldr	r0,[sp,#32+4]
#else
	ldrd	r1,r0,[sp,#32]		@ A[0][4] [in reverse order]
#endif
	eor	r11,r9,r11,ror#32-3
#ifndef	__thumb2__
	str	r10,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif
	add	r9,sp,#208
#ifndef	__thumb2__
	str	r11,[sp,#352+4]
#else
	strd	r10,r11,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif

#ifndef	__thumb2__
	ldr	r10,[sp,#232]		@ D[4]
#endif
#ifndef	__thumb2__
	ldr	r11,[sp,#232+4]
#else
	ldrd	r10,r11,[sp,#232]		@ D[4]
#endif
#ifndef	__thumb2__
	ldr	r12,[sp,#200]		@ D[0]
#endif
#ifndef	__thumb2__
	ldr	r14,[sp,#200+4]
#else
	ldrd	r12,r14,[sp,#200]		@ D[0]
#endif

	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]

	eor	r1,r1,r10
#ifndef	__thumb2__
	ldr	r2,[sp,#40]		@ A[1][0]
#endif
	eor	r0,r0,r11
#ifndef	__thumb2__
	ldr	r3,[sp,#40+4]
#else
	ldrd	r2,r3,[sp,#40]		@ A[1][0]
#endif
	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
#ifndef	__thumb2__
	ldr	r4,[sp,#88]		@ A[2][1]
#endif
	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
#ifndef	__thumb2__
	ldr	r5,[sp,#88+4]
#else
	ldrd	r4,r5,[sp,#88]		@ A[2][1]
#endif

	eor	r2,r2,r12
#ifndef	__thumb2__
	ldr	r10,[sp,#136]		@ A[3][2]
#endif
	eor	r3,r3,r14
#ifndef	__thumb2__
	ldr	r11,[sp,#136+4]
#else
	ldrd	r10,r11,[sp,#136]		@ A[3][2]
#endif
	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
#ifndef	__thumb2__
	ldr	r12,[sp,#224]		@ D[3]
#endif
	@ mov	r3,r3,ror#32-18
#ifndef	__thumb2__
	ldr	r14,[sp,#224+4]
#else
	ldrd	r12,r14,[sp,#224]		@ D[3]
#endif

	eor	r6,r6,r4
	eor	r7,r7,r5
	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	mov	r5,r7,ror#32-5

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#184]		@ A[4][3]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#184+4]
#else
	ldrd	r8,r9,[sp,#184]		@ A[4][3]
#endif
	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	mov	r6,r11,ror#32-8

	eor	r12,r12,r8
	eor	r14,r14,r9
	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	mov	r9,r14,ror#32-28

	bic	r10,r4,r2,ror#32-18
	bic	r11,r5,r3,ror#32-18
	eor	r10,r10,r0,ror#32-14
	eor	r11,r11,r1,ror#32-13
#ifndef	__thumb2__
	str	r10,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r12,r6,r4
#ifndef	__thumb2__
	str	r11,[sp,#360+4]
#else
	strd	r10,r11,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r14,r7,r5
	eor	r12,r12,r2,ror#32-18
#ifndef	__thumb2__
	str	r12,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r14,r14,r3,ror#32-18
#ifndef	__thumb2__
	str	r14,[sp,#368+4]
#else
	strd	r12,r14,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6
	bic	r11,r9,r7
	bic	r12,r0,r8,ror#14
	bic	r14,r1,r9,ror#13
	eor	r10,r10,r4
	eor	r11,r11,r5
#ifndef	__thumb2__
	str	r10,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r2,r2,r0,ror#18-14
#ifndef	__thumb2__
	str	r11,[sp,#376+4]
#else
	strd	r10,r11,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
#endif
	eor	r12,r6,r12,ror#32-14
	bic	r11,r3,r1,ror#18-13
	eor	r14,r7,r14,ror#32-13
#ifndef	__thumb2__
	str	r12,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
#endif
#ifndef	__thumb2__
	str	r14,[sp,#384+4]
#else
	strd	r12,r14,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
#endif
	add	r14,sp,#216
#ifndef	__thumb2__
	ldr	r0,[sp,#16]		@ A[0][2]
#endif
	eor	r10,r8,r2,ror#32-18
#ifndef	__thumb2__
	ldr	r1,[sp,#16+4]
#else
	ldrd	r0,r1,[sp,#16]		@ A[0][2]
#endif
	eor	r11,r9,r11,ror#32-18
#ifndef	__thumb2__
	str	r10,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif
#ifndef	__thumb2__
	str	r11,[sp,#392+4]
#else
	strd	r10,r11,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif

	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
#ifndef	__thumb2__
	ldr	r2,[sp,#64]		@ A[1][3]
#endif
#ifndef	__thumb2__
	ldr	r3,[sp,#64+4]
#else
	ldrd	r2,r3,[sp,#64]		@ A[1][3]
#endif
#ifndef	__thumb2__
	ldr	r6,[sp,#232]		@ D[4]
#endif
#ifndef	__thumb2__
	ldr	r7,[sp,#232+4]
#else
	ldrd	r6,r7,[sp,#232]		@ D[4]
#endif

	eor	r0,r0,r10
#ifndef	__thumb2__
	ldr	r4,[sp,#112]		@ A[2][4]
#endif
	eor	r1,r1,r11
#ifndef	__thumb2__
	ldr	r5,[sp,#112+4]
#else
	ldrd	r4,r5,[sp,#112]		@ A[2][4]
#endif
	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
#ifndef	__thumb2__
	ldr	r8,[sp,#200]		@ D[0]
#endif
	@ mov	r1,r1,ror#32-31
#ifndef	__thumb2__
	ldr	r9,[sp,#200+4]
#else
	ldrd	r8,r9,[sp,#200]		@ D[0]
#endif

	eor	r12,r12,r2
#ifndef	__thumb2__
	ldr	r10,[sp,#120]		@ A[3][0]
#endif
	eor	r14,r14,r3
#ifndef	__thumb2__
	ldr	r11,[sp,#120+4]
#else
	ldrd	r10,r11,[sp,#120]		@ A[3][0]
#endif
	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
#ifndef	__thumb2__
	ldr	r12,[sp,#208]		@ D[1]
#endif
	mov	r2,r14,ror#32-28
#ifndef	__thumb2__
	ldr	r14,[sp,#208+4]
#else
	ldrd	r12,r14,[sp,#208]		@ D[1]
#endif

	eor	r6,r6,r4
	eor	r7,r7,r5
	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	mov	r4,r7,ror#32-20

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#168]		@ A[4][1]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#168+4]
#else
	ldrd	r8,r9,[sp,#168]		@ A[4][1]
#endif
	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	mov	r6,r11,ror#32-21

	eor	r8,r8,r12
	eor	r9,r9,r14
	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ mov	r9,r3,ror#32-1

	bic	r10,r4,r2
	bic	r11,r5,r3
	eor	r10,r10,r0,ror#32-31
#ifndef	__thumb2__
	str	r10,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
#endif
	eor	r11,r11,r1,ror#32-31
#ifndef	__thumb2__
	str	r11,[sp,#400+4]
#else
	strd	r10,r11,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r12,r6,r4
	bic	r14,r7,r5
	eor	r12,r12,r2
	eor	r14,r14,r3
#ifndef	__thumb2__
	str	r12,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6,ror#1
#ifndef	__thumb2__
	str	r14,[sp,#408+4]
#else
	strd	r12,r14,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r11,r9,r7,ror#1
	bic	r12,r0,r8,ror#31-1
	bic	r14,r1,r9,ror#31-1
	eor	r4,r4,r10,ror#32-1
#ifndef	__thumb2__
	str	r4,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
#endif
	eor	r5,r5,r11,ror#32-1
#ifndef	__thumb2__
	str	r5,[sp,#416+4]
#else
	strd	r4,r5,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
#endif
	eor	r6,r6,r12,ror#32-31
	eor	r7,r7,r14,ror#32-31
#ifndef	__thumb2__
	str	r6,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
#endif
	bic	r10,r2,r0,ror#32-31
#ifndef	__thumb2__
	str	r7,[sp,#424+4]
#else
	strd	r6,r7,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
#endif
	bic	r11,r3,r1,ror#32-31
	add	r12,sp,#240
	eor	r8,r10,r8,ror#32-1
	add	r10,sp,#280
	eor	r9,r11,r9,ror#32-1
#ifndef	__thumb2__
	str	r8,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
#endif
#ifndef	__thumb2__
	str	r9,[sp,#432+4]
#else
	strd	r8,r9,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
#endif
	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	r0,r0,r10
	eor	r1,r1,r11
	eor	r2,r2,r12
	ldrd	r10,r11,[sp,#296]
	eor	r3,r3,r14
	ldrd	r12,r14,[sp,#304]
	eor	r4,r4,r10
	eor	r5,r5,r11
	eor	r6,r6,r12
	ldrd	r10,r11,[sp,#312]
	eor	r7,r7,r14
	ldrd	r12,r14,[sp,#320]
	eor	r8,r8,r10
	eor	r9,r9,r11
	eor	r0,r0,r12
	ldrd	r10,r11,[sp,#328]
	eor	r1,r1,r14
	ldrd	r12,r14,[sp,#336]
	eor	r2,r2,r10
	eor	r3,r3,r11
	eor	r4,r4,r12
	ldrd	r10,r11,[sp,#344]
	eor	r5,r5,r14
	ldrd	r12,r14,[sp,#352]
	eor	r6,r6,r10
	eor	r7,r7,r11
	eor	r8,r8,r12
	ldrd	r10,r11,[sp,#360]
	eor	r9,r9,r14
	ldrd	r12,r14,[sp,#368]
	eor	r0,r0,r10
	eor	r1,r1,r11
	eor	r2,r2,r12
	ldrd	r10,r11,[sp,#376]
	eor	r3,r3,r14
	ldrd	r12,r14,[sp,#384]
	eor	r4,r4,r10
	eor	r5,r5,r11
	eor	r6,r6,r12
	ldrd	r10,r11,[sp,#392]
	eor	r7,r7,r14
	ldrd	r12,r14,[sp,#400]
	eor	r8,r8,r10
	eor	r9,r9,r11
	eor	r0,r0,r12
	ldrd	r10,r11,[sp,#408]
	eor	r1,r1,r14
	ldrd	r12,r14,[sp,#256]
	eor	r2,r2,r10
	eor	r3,r3,r11
	eor	r4,r4,r12
	ldrd	r10,r11,[sp,#264]
	eor	r5,r5,r14
	ldrd	r12,r14,[sp,#272]
#else
	eor	r0,r0,r10
	add	r10,sp,#296
	eor	r1,r1,r11
	eor	r2,r2,r12
	eor	r3,r3,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
	eor	r4,r4,r10
	add	r10,sp,#312
	eor	r5,r5,r11
	eor	r6,r6,r12
	eor	r7,r7,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
	eor	r8,r8,r10
	add	r10,sp,#328
	eor	r9,r9,r11
	eor	r0,r0,r12
	eor	r1,r1,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
	eor	r2,r2,r10
	add	r10,sp,#344
	eor	r3,r3,r11
	eor	r4,r4,r12
	eor	r5,r5,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
	eor	r6,r6,r10
	add	r10,sp,#360
	eor	r7,r7,r11
	eor	r8,r8,r12
	eor	r9,r9,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
	eor	r0,r0,r10
	add	r10,sp,#376
	eor	r1,r1,r11
	eor	r2,r2,r12
	eor	r3,r3,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
	eor	r4,r4,r10
	add	r10,sp,#392
	eor	r5,r5,r11
	eor	r6,r6,r12
	eor	r7,r7,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
	eor	r8,r8,r10
	ldr	r10,[sp,#408]		@ A[4][1]
	eor	r9,r9,r11
	ldr	r11,[sp,#408+4]
	eor	r0,r0,r12
	ldr	r12,[sp,#256]		@ A[0][2]
	eor	r1,r1,r14
	ldr	r14,[sp,#256+4]
	eor	r2,r2,r10
	add	r10,sp,#264
	eor	r3,r3,r11
	eor	r4,r4,r12
	eor	r5,r5,r14
	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
#endif
	eor	r6,r6,r10
	eor	r7,r7,r11
	eor	r8,r8,r12
	eor	r9,r9,r14

	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
#ifndef	__thumb2__
	str	r10,[sp,#208]		@ D[1] = E[0]
#endif
	eor	r11,r1,r4
#ifndef	__thumb2__
	str	r11,[sp,#208+4]
#else
	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
#endif
	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	r14,r7,r0
#ifndef	__thumb2__
	str	r12,[sp,#232]		@ D[4] = E[1]
#endif
	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
#ifndef	__thumb2__
	str	r14,[sp,#232+4]
#else
	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
#endif
	eor	r1,r9,r2
#ifndef	__thumb2__
	str	r0,[sp,#200]		@ D[0] = C[0]
#endif
	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
#ifndef	__thumb2__
	ldr	r7,[sp,#384]
#endif
	eor	r3,r3,r6
#ifndef	__thumb2__
	str	r1,[sp,#200+4]
#else
	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
#endif
#ifndef	__thumb2__
	ldr	r6,[sp,#384+4]
#else
	ldrd	r7,r6,[sp,#384]
#endif
#ifndef	__thumb2__
	str	r2,[sp,#216]		@ D[2] = C[1]
#endif
	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
#ifndef	__thumb2__
	str	r3,[sp,#216+4]
#else
	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
#endif
	eor	r5,r5,r8

#ifndef	__thumb2__
	ldr	r8,[sp,#432]
#endif
#ifndef	__thumb2__
	ldr	r9,[sp,#432+4]
#else
	ldrd	r8,r9,[sp,#432]
#endif
#ifndef	__thumb2__
	str	r4,[sp,#224]		@ D[3] = C[2]
#endif
	eor	r7,r7,r4
#ifndef	__thumb2__
	str	r5,[sp,#224+4]
#else
	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
#endif
	eor	r6,r6,r5
#ifndef	__thumb2__
	ldr	r4,[sp,#240]
#endif
	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ mov	r6,r6,ror#32-11
#ifndef	__thumb2__
	ldr	r5,[sp,#240+4]
#else
	ldrd	r4,r5,[sp,#240]
#endif
	eor	r8,r8,r12
	eor	r9,r9,r14
#ifndef	__thumb2__
	ldr	r12,[sp,#336]
#endif
	eor	r0,r0,r4
#ifndef	__thumb2__
	ldr	r14,[sp,#336+4]
#else
	ldrd	r12,r14,[sp,#336]
#endif
	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ mov	r9,r9,ror#32-7
	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
	eor	r12,r12,r2
#ifndef	__thumb2__
	ldr	r2,[sp,#288]
#endif
	eor	r14,r14,r3
#ifndef	__thumb2__
	ldr	r3,[sp,#288+4]
#else
	ldrd	r2,r3,[sp,#288]
#endif
	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
	ldr	r12,[sp,#444]			@ load counter
	eor	r2,r2,r10
	adr	r10,iotas32
	mov	r4,r14,ror#32-22
	add	r14,r10,r12
	eor	r3,r3,r11
#ifndef	__thumb2__
	ldr	r10,[r14,#8]		@ iotas[i].lo
#endif
	add	r12,r12,#16
#ifndef	__thumb2__
	ldr	r11,[r14,#12]		@ iotas[i].hi
#else
	ldrd	r10,r11,[r14,#8]		@ iotas[i].lo
#endif
	cmp	r12,#192
	str	r12,[sp,#444]			@ store counter
	bic	r12,r4,r2,ror#32-22
	bic	r14,r5,r3,ror#32-22
	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
	mov	r3,r3,ror#32-22
	eor	r12,r12,r0
	eor	r14,r14,r1
	eor	r10,r10,r12
	eor	r11,r11,r14
#ifndef	__thumb2__
	str	r10,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
#endif
	bic	r12,r6,r4,ror#11
#ifndef	__thumb2__
	str	r11,[sp,#0+4]
#else
	strd	r10,r11,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
#endif
	bic	r14,r7,r5,ror#10
	bic	r10,r8,r6,ror#32-(11-7)
	bic	r11,r9,r7,ror#32-(10-7)
	eor	r12,r2,r12,ror#32-11
#ifndef	__thumb2__
	str	r12,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r14,r3,r14,ror#32-10
#ifndef	__thumb2__
	str	r14,[sp,#8+4]
#else
	strd	r12,r14,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r10,r4,r10,ror#32-7
	eor	r11,r5,r11,ror#32-7
#ifndef	__thumb2__
	str	r10,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r12,r0,r8,ror#32-7
#ifndef	__thumb2__
	str	r11,[sp,#16+4]
#else
	strd	r10,r11,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r14,r1,r9,ror#32-7
	eor	r12,r12,r6,ror#32-11
#ifndef	__thumb2__
	str	r12,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
#endif
	eor	r14,r14,r7,ror#32-10
#ifndef	__thumb2__
	str	r14,[sp,#24+4]
#else
	strd	r12,r14,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
#endif
	bic	r10,r2,r0
	add	r14,sp,#224
#ifndef	__thumb2__
	ldr	r0,[sp,#264]		@ A[0][3]
#endif
	bic	r11,r3,r1
#ifndef	__thumb2__
	ldr	r1,[sp,#264+4]
#else
	ldrd	r0,r1,[sp,#264]		@ A[0][3]
#endif
	eor	r10,r10,r8,ror#32-7
	eor	r11,r11,r9,ror#32-7
#ifndef	__thumb2__
	str	r10,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif
	add	r9,sp,#200
#ifndef	__thumb2__
	str	r11,[sp,#32+4]
#else
	strd	r10,r11,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif

	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]

#ifndef	__thumb2__
	ldr	r2,[sp,#312]		@ A[1][4]
#endif
	eor	r0,r0,r10
#ifndef	__thumb2__
	ldr	r3,[sp,#312+4]
#else
	ldrd	r2,r3,[sp,#312]		@ A[1][4]
#endif
	eor	r1,r1,r11
	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
#ifndef	__thumb2__
	ldr	r10,[sp,#368]		@ A[3][1]
#endif
	@ mov	r1,r1,ror#32-14
#ifndef	__thumb2__
	ldr	r11,[sp,#368+4]
#else
	ldrd	r10,r11,[sp,#368]		@ A[3][1]
#endif

	eor	r2,r2,r12
#ifndef	__thumb2__
	ldr	r4,[sp,#320]		@ A[2][0]
#endif
	eor	r3,r3,r14
#ifndef	__thumb2__
	ldr	r5,[sp,#320+4]
#else
	ldrd	r4,r5,[sp,#320]		@ A[2][0]
#endif
	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ mov	r3,r3,ror#32-10

	eor	r6,r6,r4
#ifndef	__thumb2__
	ldr	r12,[sp,#216]		@ D[2]
#endif
	eor	r7,r7,r5
#ifndef	__thumb2__
	ldr	r14,[sp,#216+4]
#else
	ldrd	r12,r14,[sp,#216]		@ D[2]
#endif
	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	mov	r4,r7,ror#32-2

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#416]		@ A[4][2]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#416+4]
#else
	ldrd	r8,r9,[sp,#416]		@ A[4][2]
#endif
	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	mov	r6,r11,ror#32-23

	bic	r10,r4,r2,ror#32-10
	bic	r11,r5,r3,ror#32-10
	eor	r12,r12,r8
	eor	r14,r14,r9
	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	mov	r8,r14,ror#32-31
	eor	r10,r10,r0,ror#32-14
	eor	r11,r11,r1,ror#32-14
#ifndef	__thumb2__
	str	r10,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r12,r6,r4
#ifndef	__thumb2__
	str	r11,[sp,#40+4]
#else
	strd	r10,r11,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r14,r7,r5
	eor	r12,r12,r2,ror#32-10
#ifndef	__thumb2__
	str	r12,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r14,r14,r3,ror#32-10
#ifndef	__thumb2__
	str	r14,[sp,#48+4]
#else
	strd	r12,r14,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6
	bic	r11,r9,r7
	bic	r12,r0,r8,ror#14
	bic	r14,r1,r9,ror#14
	eor	r10,r10,r4
	eor	r11,r11,r5
#ifndef	__thumb2__
	str	r10,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r2,r2,r0,ror#32-(14-10)
#ifndef	__thumb2__
	str	r11,[sp,#56+4]
#else
	strd	r10,r11,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
#endif
	eor	r12,r6,r12,ror#32-14
	bic	r11,r3,r1,ror#32-(14-10)
#ifndef	__thumb2__
	str	r12,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
#endif
	eor	r14,r7,r14,ror#32-14
#ifndef	__thumb2__
	str	r14,[sp,#64+4]
#else
	strd	r12,r14,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
#endif
	add	r12,sp,#208
#ifndef	__thumb2__
	ldr	r1,[sp,#248]		@ A[0][1]
#endif
	eor	r10,r8,r2,ror#32-10
#ifndef	__thumb2__
	ldr	r0,[sp,#248+4]
#else
	ldrd	r1,r0,[sp,#248]		@ A[0][1]
#endif
	eor	r11,r9,r11,ror#32-10
#ifndef	__thumb2__
	str	r10,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif
#ifndef	__thumb2__
	str	r11,[sp,#72+4]
#else
	strd	r10,r11,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif

	add	r9,sp,#224
	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
#ifndef	__thumb2__
	ldr	r2,[sp,#296]		@ A[1][2]
#endif
#ifndef	__thumb2__
	ldr	r3,[sp,#296+4]
#else
	ldrd	r2,r3,[sp,#296]		@ A[1][2]
#endif
	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]

	eor	r1,r1,r10
#ifndef	__thumb2__
	ldr	r4,[sp,#344]		@ A[2][3]
#endif
	eor	r0,r0,r11
#ifndef	__thumb2__
	ldr	r5,[sp,#344+4]
#else
	ldrd	r4,r5,[sp,#344]		@ A[2][3]
#endif
	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	r2,r2,r12
#ifndef	__thumb2__
	ldr	r10,[sp,#392]		@ A[3][4]
#endif
	eor	r3,r3,r14
#ifndef	__thumb2__
	ldr	r11,[sp,#392+4]
#else
	ldrd	r10,r11,[sp,#392]		@ A[3][4]
#endif
	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
#ifndef	__thumb2__
	ldr	r12,[sp,#200]		@ D[0]
#endif
	@ mov	r3,r3,ror#32-3
#ifndef	__thumb2__
	ldr	r14,[sp,#200+4]
#else
	ldrd	r12,r14,[sp,#200]		@ D[0]
#endif

	eor	r4,r4,r6
	eor	r5,r5,r7
	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ mov	r4,r7,ror#32-13		@ [track reverse order below]

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#400]		@ A[4][0]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#400+4]
#else
	ldrd	r8,r9,[sp,#400]		@ A[4][0]
#endif
	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	mov	r7,r11,ror#32-4

	eor	r12,r12,r8
	eor	r14,r14,r9
	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	mov	r9,r14,ror#32-9

	bic	r10,r5,r2,ror#13-3
	bic	r11,r4,r3,ror#12-3
	bic	r12,r6,r5,ror#32-13
	bic	r14,r7,r4,ror#32-12
	eor	r10,r0,r10,ror#32-13
	eor	r11,r1,r11,ror#32-12
#ifndef	__thumb2__
	str	r10,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
#endif
	eor	r12,r12,r2,ror#32-3
#ifndef	__thumb2__
	str	r11,[sp,#80+4]
#else
	strd	r10,r11,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
#endif
	eor	r14,r14,r3,ror#32-3
#ifndef	__thumb2__
	str	r12,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6
	bic	r11,r9,r7
#ifndef	__thumb2__
	str	r14,[sp,#88+4]
#else
	strd	r12,r14,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r10,r10,r5,ror#32-13
	eor	r11,r11,r4,ror#32-12
#ifndef	__thumb2__
	str	r10,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r12,r0,r8
#ifndef	__thumb2__
	str	r11,[sp,#96+4]
#else
	strd	r10,r11,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r14,r1,r9
	eor	r12,r12,r6
	eor	r14,r14,r7
#ifndef	__thumb2__
	str	r12,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
#endif
	bic	r10,r2,r0,ror#3
#ifndef	__thumb2__
	str	r14,[sp,#104+4]
#else
	strd	r12,r14,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
#endif
	bic	r11,r3,r1,ror#3
#ifndef	__thumb2__
	ldr	r1,[sp,#272]		@ A[0][4] [in reverse order]
#endif
	eor	r10,r8,r10,ror#32-3
#ifndef	__thumb2__
	ldr	r0,[sp,#272+4]
#else
	ldrd	r1,r0,[sp,#272]		@ A[0][4] [in reverse order]
#endif
	eor	r11,r9,r11,ror#32-3
#ifndef	__thumb2__
	str	r10,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif
	add	r9,sp,#208
#ifndef	__thumb2__
	str	r11,[sp,#112+4]
#else
	strd	r10,r11,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif

#ifndef	__thumb2__
	ldr	r10,[sp,#232]		@ D[4]
#endif
#ifndef	__thumb2__
	ldr	r11,[sp,#232+4]
#else
	ldrd	r10,r11,[sp,#232]		@ D[4]
#endif
#ifndef	__thumb2__
	ldr	r12,[sp,#200]		@ D[0]
#endif
#ifndef	__thumb2__
	ldr	r14,[sp,#200+4]
#else
	ldrd	r12,r14,[sp,#200]		@ D[0]
#endif

	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]

	eor	r1,r1,r10
#ifndef	__thumb2__
	ldr	r2,[sp,#280]		@ A[1][0]
#endif
	eor	r0,r0,r11
#ifndef	__thumb2__
	ldr	r3,[sp,#280+4]
#else
	ldrd	r2,r3,[sp,#280]		@ A[1][0]
#endif
	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
#ifndef	__thumb2__
	ldr	r4,[sp,#328]		@ A[2][1]
#endif
	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
#ifndef	__thumb2__
	ldr	r5,[sp,#328+4]
#else
	ldrd	r4,r5,[sp,#328]		@ A[2][1]
#endif

	eor	r2,r2,r12
#ifndef	__thumb2__
	ldr	r10,[sp,#376]		@ A[3][2]
#endif
	eor	r3,r3,r14
#ifndef	__thumb2__
	ldr	r11,[sp,#376+4]
#else
	ldrd	r10,r11,[sp,#376]		@ A[3][2]
#endif
	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
#ifndef	__thumb2__
	ldr	r12,[sp,#224]		@ D[3]
#endif
	@ mov	r3,r3,ror#32-18
#ifndef	__thumb2__
	ldr	r14,[sp,#224+4]
#else
	ldrd	r12,r14,[sp,#224]		@ D[3]
#endif

	eor	r6,r6,r4
	eor	r7,r7,r5
	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	mov	r5,r7,ror#32-5

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#424]		@ A[4][3]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#424+4]
#else
	ldrd	r8,r9,[sp,#424]		@ A[4][3]
#endif
	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	mov	r6,r11,ror#32-8

	eor	r12,r12,r8
	eor	r14,r14,r9
	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	mov	r9,r14,ror#32-28

	bic	r10,r4,r2,ror#32-18
	bic	r11,r5,r3,ror#32-18
	eor	r10,r10,r0,ror#32-14
	eor	r11,r11,r1,ror#32-13
#ifndef	__thumb2__
	str	r10,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r12,r6,r4
#ifndef	__thumb2__
	str	r11,[sp,#120+4]
#else
	strd	r10,r11,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r14,r7,r5
	eor	r12,r12,r2,ror#32-18
#ifndef	__thumb2__
	str	r12,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
#endif
	eor	r14,r14,r3,ror#32-18
#ifndef	__thumb2__
	str	r14,[sp,#128+4]
#else
	strd	r12,r14,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6
	bic	r11,r9,r7
	bic	r12,r0,r8,ror#14
	bic	r14,r1,r9,ror#13
	eor	r10,r10,r4
	eor	r11,r11,r5
#ifndef	__thumb2__
	str	r10,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
#endif
	bic	r2,r2,r0,ror#18-14
#ifndef	__thumb2__
	str	r11,[sp,#136+4]
#else
	strd	r10,r11,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
#endif
	eor	r12,r6,r12,ror#32-14
	bic	r11,r3,r1,ror#18-13
	eor	r14,r7,r14,ror#32-13
#ifndef	__thumb2__
	str	r12,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
#endif
#ifndef	__thumb2__
	str	r14,[sp,#144+4]
#else
	strd	r12,r14,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
#endif
	add	r14,sp,#216
#ifndef	__thumb2__
	ldr	r0,[sp,#256]		@ A[0][2]
#endif
	eor	r10,r8,r2,ror#32-18
#ifndef	__thumb2__
	ldr	r1,[sp,#256+4]
#else
	ldrd	r0,r1,[sp,#256]		@ A[0][2]
#endif
	eor	r11,r9,r11,ror#32-18
#ifndef	__thumb2__
	str	r10,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif
#ifndef	__thumb2__
	str	r11,[sp,#152+4]
#else
	strd	r10,r11,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif

	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
#ifndef	__thumb2__
	ldr	r2,[sp,#304]		@ A[1][3]
#endif
#ifndef	__thumb2__
	ldr	r3,[sp,#304+4]
#else
	ldrd	r2,r3,[sp,#304]		@ A[1][3]
#endif
#ifndef	__thumb2__
	ldr	r6,[sp,#232]		@ D[4]
#endif
#ifndef	__thumb2__
	ldr	r7,[sp,#232+4]
#else
	ldrd	r6,r7,[sp,#232]		@ D[4]
#endif

	eor	r0,r0,r10
#ifndef	__thumb2__
	ldr	r4,[sp,#352]		@ A[2][4]
#endif
	eor	r1,r1,r11
#ifndef	__thumb2__
	ldr	r5,[sp,#352+4]
#else
	ldrd	r4,r5,[sp,#352]		@ A[2][4]
#endif
	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
#ifndef	__thumb2__
	ldr	r8,[sp,#200]		@ D[0]
#endif
	@ mov	r1,r1,ror#32-31
#ifndef	__thumb2__
	ldr	r9,[sp,#200+4]
#else
	ldrd	r8,r9,[sp,#200]		@ D[0]
#endif

	eor	r12,r12,r2
#ifndef	__thumb2__
	ldr	r10,[sp,#360]		@ A[3][0]
#endif
	eor	r14,r14,r3
#ifndef	__thumb2__
	ldr	r11,[sp,#360+4]
#else
	ldrd	r10,r11,[sp,#360]		@ A[3][0]
#endif
	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
#ifndef	__thumb2__
	ldr	r12,[sp,#208]		@ D[1]
#endif
	mov	r2,r14,ror#32-28
#ifndef	__thumb2__
	ldr	r14,[sp,#208+4]
#else
	ldrd	r12,r14,[sp,#208]		@ D[1]
#endif

	eor	r6,r6,r4
	eor	r7,r7,r5
	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	mov	r4,r7,ror#32-20

	eor	r10,r10,r8
#ifndef	__thumb2__
	ldr	r8,[sp,#408]		@ A[4][1]
#endif
	eor	r11,r11,r9
#ifndef	__thumb2__
	ldr	r9,[sp,#408+4]
#else
	ldrd	r8,r9,[sp,#408]		@ A[4][1]
#endif
	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	mov	r6,r11,ror#32-21

	eor	r8,r8,r12
	eor	r9,r9,r14
	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ mov	r9,r3,ror#32-1

	bic	r10,r4,r2
	bic	r11,r5,r3
	eor	r10,r10,r0,ror#32-31
#ifndef	__thumb2__
	str	r10,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
#endif
	eor	r11,r11,r1,ror#32-31
#ifndef	__thumb2__
	str	r11,[sp,#160+4]
#else
	strd	r10,r11,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
#endif
	bic	r12,r6,r4
	bic	r14,r7,r5
	eor	r12,r12,r2
	eor	r14,r14,r3
#ifndef	__thumb2__
	str	r12,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r10,r8,r6,ror#1
#ifndef	__thumb2__
	str	r14,[sp,#168+4]
#else
	strd	r12,r14,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
#endif
	bic	r11,r9,r7,ror#1
	bic	r12,r0,r8,ror#31-1
	bic	r14,r1,r9,ror#31-1
	eor	r4,r4,r10,ror#32-1
#ifndef	__thumb2__
	str	r4,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
#endif
	eor	r5,r5,r11,ror#32-1
#ifndef	__thumb2__
	str	r5,[sp,#176+4]
#else
	strd	r4,r5,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
#endif
	eor	r6,r6,r12,ror#32-31
	eor	r7,r7,r14,ror#32-31
#ifndef	__thumb2__
	str	r6,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
#endif
	bic	r10,r2,r0,ror#32-31
#ifndef	__thumb2__
	str	r7,[sp,#184+4]
#else
	strd	r6,r7,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
#endif
	bic	r11,r3,r1,ror#32-31
	add	r12,sp,#0
	eor	r8,r10,r8,ror#32-1
	add	r10,sp,#40
	eor	r9,r11,r9,ror#32-1
#ifndef	__thumb2__
	str	r8,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
#endif
#ifndef	__thumb2__
	str	r9,[sp,#192+4]
#else
	strd	r8,r9,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
#endif
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	r10,r0,#40
	add	r11,sp,#40
	ldmia	r0,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ copy A[5][5] to stack
	stmia	sp,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	add	r12,sp,#0
	add	r10,sp,#40
	stmia	r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}

	bl	KeccakF1600_enter

	ldr	r11, [sp,#440+16]		@ restore pointer to A
	ldmia	sp,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ return A[5][5]
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	sub	sp,sp,#456+16

	add	r10,r0,#40
	@ mov	r11,r1
	mov	r12,r2
	mov	r14,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	r11,sp,#0
	ldmia	r0,      {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ copy A[5][5] to stack
	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r11,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}

	ldr	r11,[sp,#476]		@ restore r11
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,r12,r14
	blo	.Labsorbed
	add	r10,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[r11],#1
	ldrb	r1,[r11],#1
	ldrb	r2,[r11],#1
	ldrb	r3,[r11],#1
	ldrb	r4,[r11],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[r11],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[r11],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[r11],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	r10,{r4,r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	mov	r2,r2,lsl#16
	mov	r1,r1,lsr#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	r10!,{r4,r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	r14,r14,#8
	bhi	.Loop_block

	str	r11,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6,r7,r8,r9,r10,r11,r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	r11,sp,#40
	ldmia	sp,      {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ return A[5][5]
	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	ldmia	r11,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
	stmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,r12			@ return value
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_absorb,.-SHA3_absorb
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	r10,r0
	mov	r4,r1
	mov	r5,r2
	mov	r12,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6,r7,r8,r9}

	mov	r14,r10
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	r10!,{r0,r1}	@ A_flat[i++]

	mov	r2,r0,lsl#16
	mov	r3,r1,lsl#16		@ r3 = r1 << 16
	mov	r2,r2,lsr#16		@ r2 = r0 & 0x0000ffff
	mov	r1,r1,lsr#16
	mov	r0,r0,lsr#16		@ r0 = r0 >> 16
	mov	r1,r1,lsl#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	r5,#8
	blo	.Lsqueeze_tail
	mov	r1,r2,lsr#8
	strb	r2,[r4],#1
	mov	r3,r2,lsr#16
	strb	r1,[r4],#1
	mov	r2,r2,lsr#24
	strb	r3,[r4],#1
	strb	r2,[r4],#1

	mov	r1,r0,lsr#8
	strb	r0,[r4],#1
	mov	r3,r0,lsr#16
	strb	r1,[r4],#1
	mov	r0,r0,lsr#24
	strb	r3,[r4],#1
	strb	r0,[r4],#1
	subs	r5,r5,#8
	beq	.Lsqueeze_done

	subs	r12,r12,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original r10

	bl	KeccakF1600

	ldmia	sp,{r6,r7,r8,r9,r10,r12}		@ restore constants and variables
	mov	r14,r10
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	subs	r5,r5,#1
	beq	.Lsqueeze_done

	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	iotas64, %object
.align	5
iotas64:
.quad	0x0000000000000001
.quad	0x0000000000008082
.quad	0x800000000000808a
.quad	0x8000000080008000
.quad	0x000000000000808b
.quad	0x0000000080000001
.quad	0x8000000080008081
.quad	0x8000000000008009
.quad	0x000000000000008a
.quad	0x0000000000000088
.quad	0x0000000080008009
.quad	0x000000008000000a
.quad	0x000000008000808b
.quad	0x800000000000008b
.quad	0x8000000000008089
.quad	0x8000000000008003
.quad	0x8000000000008002
.quad	0x8000000000000080
.quad	0x000000000000800a
.quad	0x800000008000000a
.quad	0x8000000080008081
.quad	0x8000000000008080
.quad	0x0000000080000001
.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2]    ^= D[2]

	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4]    ^= C[3]

	@ Rho + Pi
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic	q13, q2,  q1
	vbic	q14, q3,  q2
	vbic	q15, q4,  q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic	q13, q0,  q4
	vbic	q15, q1,  q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7,  q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8,  q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9,  q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0,  q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	d25, [r2,:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	bx	lr
.size	KeccakF1600_neon,.-KeccakF1600_neon

.globl	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4,r5,r6,lr}
	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
	vld1.32	{d18}, [r0,:64]!		@ A[2][4]

	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
	vld1.32	{d19}, [r0,:64]!		@ A[3][4]

	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0,:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
	ldmia	sp!, {r4,r5,r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.globl	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4,r5,r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!,  {d8,d9,d10,d11,d12,d13,d14,d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	mov	r2, r2, lsr#8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4,r5,r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
#endif
.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
